{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd1b7983",
   "metadata": {},
   "outputs": [],
   "source": [
    "# autoreload\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4d7ff82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import praw\n",
    "import prawcore\n",
    "from bs4 import BeautifulSoup as bs\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import pickle\n",
    "from random import sample, choice\n",
    "import concurrent.futures"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f191d54",
   "metadata": {},
   "source": [
    "# List of subreddits to scrape\n",
    "\n",
    "this list was build from https://anvaka.github.io/redsim. Can be used to expand the list of favourable subreddits.\n",
    "\n",
    "taking these for now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2432ba1",
   "metadata": {},
   "outputs": [],
   "source": [
    "subs = [\n",
    "    \"Sexpolls\",\n",
    "    \"sexpositions\",\n",
    "    \"Sexconfessional\",\n",
    "    \"penissize\",\n",
    "    \"masturbation\",\n",
    "    \"AskRedditNSFW\",\n",
    "    \"sexstories\",\n",
    "    \"SexFantasies\",\n",
    "    \"sexconfession\",\n",
    "    \"askwoman_aboutsex\",\n",
    "    \"sextips\",\n",
    "    \"sexualhealth\",\n",
    "    \"SexPositive\",\n",
    "    \"DirtyConfession\",\n",
    "    \"Puberty\",\n",
    "    \"NSFWIAMA\",\n",
    "    \"sexover30\",\n",
    "    \"SexToys\",\n",
    "    \"sexquestions\",\n",
    "    \"deepvaginaproblems\",\n",
    "    \"kegels\",\n",
    "    \"sexeducation\",\n",
    "    \"ColoredLang\",\n",
    "    \"masterbationstories\",\n",
    "    \"RedditAfterDark\",\n",
    "    \"Threesome_advice\",\n",
    "]\n",
    "\n",
    "# \"NSFWIAMA\" -> special one (TODO f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f34c31f0",
   "metadata": {},
   "source": [
    "# Scrap these Subreddits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e2ff9cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import scrap_subreddit, init_praw_reddit, save_to_huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8507c27e",
   "metadata": {},
   "outputs": [],
   "source": [
    "reddit = init_praw_reddit()\n",
    "\n",
    "# loop through subreddits and save to an intermediate folder\n",
    "for sub in tqdm(subs):\n",
    "    try:\n",
    "        df = scrape_subreddit(sub, reddit)\n",
    "        if df is not None:\n",
    "            file_name = f\"dataframes/{sub}.csv\"\n",
    "            df.to_csv(file_name, index=False)\n",
    "            print(\"subreddit saved to: \", file_name)\n",
    "    except Exception as e:\n",
    "        logger.error(f\"Error scraping {sub}: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56ae9ee8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# clean the results\n",
    "files = os.listdir(\"dataframes/\")\n",
    "dfs = []\n",
    "for file in files:\n",
    "    df = pd.read_csv(f\"dataframes/{file}\")\n",
    "    dfs.append(df[df[\"is_question\"] & df[\"is_self\"]])\n",
    "\n",
    "cleaned_df = pd.concat(dfs)\n",
    "print(cleaned_df.shape, cleaned_df[\"subreddit\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "c90aca1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset json/default to /home/jjmachan/.cache/huggingface/datasets/json/default-9433ebf74f6f4bdb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b626ca7617204ce08f24581a66c91ee2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "806b00cb90e243d1935508a45a039642",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset json downloaded and prepared to /home/jjmachan/.cache/huggingface/datasets/json/default-9433ebf74f6f4bdb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "857341544c4c42a1bac2a23797613f66",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Pushing split train to the Hub.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "122619576a37463f97d1189491823b0b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e77487d84f824c8aaedfaf1d31acaf52",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "save_to_huggingface(cleaned_df, name=\"jjmachan/NSFW-questions-inter-cleaned_df\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11e7546e",
   "metadata": {},
   "source": [
    "# Filter with prosocial\n",
    "\n",
    "The prosocial.ipynb has the code that converts the `clean_df` intermediate dataset stored in `jjmachan/NSFW-questions-inter-cleaned_df` "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd5bb6b2",
   "metadata": {},
   "source": [
    "# Get Comments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e81c6432",
   "metadata": {},
   "outputs": [],
   "source": [
    "get_comments(cleaned_df[\"post_id\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "5bb39fe6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12269, 14) (1442, 6) (1442, 14)\n"
     ]
    }
   ],
   "source": [
    "nsfw_with_comments = pd.read_csv(\"df_with_comments.csv\").drop_duplicates(subset=[\"post_id\"])\n",
    "nsfw = pd.read_csv(\"nsfw.csv\").drop_duplicates(subset=[\"post_id\"])\n",
    "nsfw_final = pd.merge(nsfw.drop(columns=[f\"C{i+1}\" for i in range(5)]), nsfw_with_comments, on=\"post_id\")\n",
    "print(nsfw.shape, nsfw_with_comments.shape, nsfw_final.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "652869f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "nsfw_final.sample(15)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1bccbcba",
   "metadata": {},
   "source": [
    "# Save to Huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "44d44cc1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset json/default to /home/jjmachan/.cache/huggingface/datasets/json/default-424060f196a2282b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e4f0e24800d84b779251d47e728d8c38",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a40c813b9b8f4553bdc5d0b0627ffbf6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset json downloaded and prepared to /home/jjmachan/.cache/huggingface/datasets/json/default-424060f196a2282b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f1de786d57c840f3948a75335a5472cf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Pushing split train to the Hub.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c703b1721fa8405092cec72399712f2a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "06865da3d5324a95ab32c4f78de0cf4f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "146ab5efdf7343ab9ad8f3bc93a0ffaa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from utils import save_to_huggingface\n",
    "\n",
    "save_to_huggingface(nsfw_final, name=\"jjmachan/NSFW-questions\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
